{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "loading train texts: 100%|██████████| 18846/18846 [00:02<00:00, 6466.04it/s]\n",
      "parsing texts: 100%|██████████| 18846/18846 [00:01<00:00, 15727.93it/s]\n",
      "2024-06-19 22:44:06,347 - TopMost - Real vocab size: 10000\n",
      "2024-06-19 22:44:06,442 - TopMost - Real training size: 18846 \t avg length: 68.727\n",
      "loading word embeddings: 100%|██████████| 10000/10000 [00:02<00:00, 4655.48it/s]\n",
      "2024-06-19 22:44:35,116 - TopMost - number of found embeddings: 9869/10000\n"
     ]
    }
   ],
   "source": [
    "import topmost\n",
    "from topmost import RawDataset, Preprocess\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n",
    "preprocess = Preprocess(vocab_size=10000)\n",
    "\n",
    "device = 'cuda' # or 'cpu'\n",
    "dataset = RawDataset(docs, preprocess, device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/200 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  2%|▏         | 4/200 [00:00<00:37,  5.24it/s]2024-06-19 22:47:21,750 - TopMost - Epoch: 005 loss: 603.633\n",
      "  4%|▍         | 9/200 [00:01<00:36,  5.23it/s]2024-06-19 22:47:22,705 - TopMost - Epoch: 010 loss: 587.783\n",
      "  7%|▋         | 14/200 [00:02<00:35,  5.23it/s]2024-06-19 22:47:23,661 - TopMost - Epoch: 015 loss: 579.386\n",
      " 10%|▉         | 19/200 [00:03<00:34,  5.23it/s]2024-06-19 22:47:24,617 - TopMost - Epoch: 020 loss: 573.435\n",
      " 12%|█▏        | 24/200 [00:04<00:33,  5.23it/s]2024-06-19 22:47:25,573 - TopMost - Epoch: 025 loss: 570.076\n",
      " 14%|█▍        | 29/200 [00:05<00:32,  5.23it/s]2024-06-19 22:47:26,529 - TopMost - Epoch: 030 loss: 565.652\n",
      " 17%|█▋        | 34/200 [00:06<00:31,  5.24it/s]2024-06-19 22:47:27,483 - TopMost - Epoch: 035 loss: 564.753\n",
      " 20%|█▉        | 39/200 [00:07<00:30,  5.23it/s]2024-06-19 22:47:28,439 - TopMost - Epoch: 040 loss: 563.109\n",
      " 22%|██▏       | 44/200 [00:08<00:29,  5.24it/s]2024-06-19 22:47:29,394 - TopMost - Epoch: 045 loss: 561.784\n",
      " 24%|██▍       | 49/200 [00:09<00:28,  5.24it/s]2024-06-19 22:47:30,349 - TopMost - Epoch: 050 loss: 558.923\n",
      " 27%|██▋       | 54/200 [00:10<00:27,  5.24it/s]2024-06-19 22:47:31,305 - TopMost - Epoch: 055 loss: 558.468\n",
      " 30%|██▉       | 59/200 [00:11<00:26,  5.23it/s]2024-06-19 22:47:32,261 - TopMost - Epoch: 060 loss: 559.410\n",
      " 32%|███▏      | 64/200 [00:12<00:26,  5.23it/s]2024-06-19 22:47:33,218 - TopMost - Epoch: 065 loss: 558.471\n",
      " 34%|███▍      | 69/200 [00:13<00:25,  5.23it/s]2024-06-19 22:47:34,175 - TopMost - Epoch: 070 loss: 557.579\n",
      " 37%|███▋      | 74/200 [00:14<00:24,  5.24it/s]2024-06-19 22:47:35,130 - TopMost - Epoch: 075 loss: 558.901\n",
      " 40%|███▉      | 79/200 [00:15<00:23,  5.23it/s]2024-06-19 22:47:36,087 - TopMost - Epoch: 080 loss: 555.215\n",
      " 42%|████▏     | 84/200 [00:16<00:22,  5.23it/s]2024-06-19 22:47:37,044 - TopMost - Epoch: 085 loss: 556.153\n",
      " 44%|████▍     | 89/200 [00:17<00:21,  5.22it/s]2024-06-19 22:47:38,002 - TopMost - Epoch: 090 loss: 556.447\n",
      " 47%|████▋     | 94/200 [00:17<00:20,  5.27it/s]2024-06-19 22:47:38,947 - TopMost - Epoch: 095 loss: 556.082\n",
      " 50%|████▉     | 99/200 [00:18<00:19,  5.30it/s]2024-06-19 22:47:39,890 - TopMost - Epoch: 100 loss: 555.615\n",
      " 52%|█████▏    | 104/200 [00:19<00:18,  5.29it/s]2024-06-19 22:47:40,834 - TopMost - Epoch: 105 loss: 556.640\n",
      " 55%|█████▍    | 109/200 [00:20<00:17,  5.30it/s]2024-06-19 22:47:41,778 - TopMost - Epoch: 110 loss: 554.898\n",
      " 57%|█████▋    | 114/200 [00:21<00:16,  5.31it/s]2024-06-19 22:47:42,721 - TopMost - Epoch: 115 loss: 554.223\n",
      " 60%|█████▉    | 119/200 [00:22<00:15,  5.30it/s]2024-06-19 22:47:43,665 - TopMost - Epoch: 120 loss: 555.583\n",
      " 62%|██████▏   | 124/200 [00:23<00:14,  5.29it/s]2024-06-19 22:47:44,610 - TopMost - Epoch: 125 loss: 555.232\n",
      " 64%|██████▍   | 129/200 [00:24<00:13,  5.29it/s]2024-06-19 22:47:45,555 - TopMost - Epoch: 130 loss: 555.783\n",
      " 67%|██████▋   | 134/200 [00:25<00:12,  5.28it/s]2024-06-19 22:47:46,503 - TopMost - Epoch: 135 loss: 553.663\n",
      " 70%|██████▉   | 139/200 [00:26<00:11,  5.28it/s]2024-06-19 22:47:47,450 - TopMost - Epoch: 140 loss: 555.202\n",
      " 72%|███████▏  | 144/200 [00:27<00:10,  5.28it/s]2024-06-19 22:47:48,397 - TopMost - Epoch: 145 loss: 555.462\n",
      " 74%|███████▍  | 149/200 [00:28<00:09,  5.27it/s]2024-06-19 22:47:49,346 - TopMost - Epoch: 150 loss: 554.323\n",
      " 77%|███████▋  | 154/200 [00:29<00:08,  5.28it/s]2024-06-19 22:47:50,294 - TopMost - Epoch: 155 loss: 554.762\n",
      " 80%|███████▉  | 159/200 [00:30<00:07,  5.27it/s]2024-06-19 22:47:51,243 - TopMost - Epoch: 160 loss: 553.590\n",
      " 82%|████████▏ | 164/200 [00:31<00:06,  5.29it/s]2024-06-19 22:47:52,188 - TopMost - Epoch: 165 loss: 554.341\n",
      " 84%|████████▍ | 169/200 [00:32<00:05,  5.28it/s]2024-06-19 22:47:53,134 - TopMost - Epoch: 170 loss: 554.091\n",
      " 87%|████████▋ | 174/200 [00:33<00:04,  5.28it/s]2024-06-19 22:47:54,080 - TopMost - Epoch: 175 loss: 553.778\n",
      " 90%|████████▉ | 179/200 [00:34<00:03,  5.28it/s]2024-06-19 22:47:55,026 - TopMost - Epoch: 180 loss: 553.442\n",
      " 92%|█████████▏| 184/200 [00:34<00:03,  5.29it/s]2024-06-19 22:47:55,971 - TopMost - Epoch: 185 loss: 553.638\n",
      " 94%|█████████▍| 189/200 [00:35<00:02,  5.29it/s]2024-06-19 22:47:56,917 - TopMost - Epoch: 190 loss: 553.713\n",
      " 97%|█████████▋| 194/200 [00:36<00:01,  5.29it/s]2024-06-19 22:47:57,862 - TopMost - Epoch: 195 loss: 552.761\n",
      "100%|█████████▉| 199/200 [00:37<00:00,  5.29it/s]2024-06-19 22:47:58,807 - TopMost - Epoch: 200 loss: 553.261\n",
      "100%|██████████| 200/200 [00:38<00:00,  5.26it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 0: commandments contexts cor pauls christians argues cites phrases explicitly homosexuality contemporary gentiles wright biblical gentile\n",
      "Topic 1: sexual homosexual promiscuous marriage church sin partners cramer intercourse homosexuals homosexuality sex lesbian marriages heterosexual\n",
      "Topic 2: manuals obo printer commodore sale forsale shipping borland compatible panasonic spreadsheet toner cod baud monitor\n",
      "Topic 3: identity conventions operators abuses liable responsibilities pools columns mechanisms identification oreilly servers privacy originate frontier\n",
      "Topic 4: casualties mysteries tyre lebanon unified iraqi stars islands temperature bursts dewey lebanese britain iranian magnetic\n",
      "Topic 5: motherboard slots seagate mhz iisi connector vram card ide simms quadra adapter vlb cache pds\n",
      "Topic 6: chastity skepticism shameful intellect surrender pcx dortmund dxf gordon rainer download upload bmp autocad pointers\n",
      "Topic 7: proclaim disobedience prayed rejection persecution christ authoritative wicked lord praying psalms quran covenant imprisoned rebellion\n",
      "Topic 8: emulator exec console fonts monochrome server toolkits casey initialization clients config xterm dev font applications\n",
      "Topic 9: chastity shameful skepticism intellect dtmedin ingr medin catbyte surrender ssd resistor cooling solder voltage gordon\n",
      "Topic 10: clutch infringed morality sentence sabo regulated moral militia unconditional amendment constitution jsh arms coercion theism\n",
      "Topic 11: brewers orioles padres cardinals rockies mets sox dodgers streak phillies braves cincinnati diego angels marlins\n",
      "Topic 12: lebanese syrian israel lebanon occupying israeli civilians guns disarmed withdraw laws firearms crime syria handguns\n",
      "Topic 13: villages indianapolis plains moslems moslem memoirs tartars comprised succeeded administrative hartill revolt officer armies bobbs\n",
      "Topic 14: defensive defense clutch player defensively offensive alomar rbi fielding recchi season winfield players tying fielder\n",
      "Topic 15: msg baptism faith existence spiritually supernatural realm systemic atoms noring incarnation infj yeast dissemination spiritual\n",
      "Topic 16: vlb dos scsi ram interfaces cache cga esdi tcp winmarks port controller card ethernet ati\n",
      "Topic 17: disciples jesuss psalm jesus saint sins azerbaijanis fulfilled bosnia president shalt congregation azerbaijani decisions joshua\n",
      "Topic 18: lib cica undefined truetype ati winini symbol autoexecbat atm speedstar icon fonts screen configsys windows\n",
      "Topic 19: app wolverine bagged hulk det chi bos nyi phi saga edm pit stl tor nyr\n",
      "Topic 20: innings inning sanders martinez stephenson pitchers pitching pitcher pitched pitches larkin hitter williams pitch fielder\n",
      "Topic 21: jesus apostle heaven behold noble conception unto immaculate lord covenant guards scriptures thee eternal hebrews\n",
      "Topic 22: insurance stein washer wibbled orlando conservatives legion mlud ceiling kitchen exterior avengers gld arlington moncton\n",
      "Topic 23: manipulations pixel lookup multiply graphs ntsc photoshop defining filtering processing imported filters iris analysis data\n",
      "Topic 24: max biz vax brewers yesterdays scorer air nah mceachern pct padres oakland streak royals mullen\n",
      "Topic 25: playoffs overtime winnipeg goaltending scoring puck isles habs buffalo selanne goaltender goalies canucks quebec penalty\n",
      "Topic 26: casserole tlu gic truelove leftover chastity beauchaine intellect sank shameful skepticism bronx hut surrender pizza\n",
      "Topic 27: connector connects connect voltage connectors connecting resistor cable connections wires nec circuit pin portions ttl\n",
      "Topic 28: persian police francisco investigators spying illegally azerbaijan san semitism angeles secretly adl african confidential accused\n",
      "Topic 29: jpeg newsanswers format uploads ppm color msdos xloadimage compression compress newest gif rtfmmitedu ftpcicaindianaedu pixels\n",
      "Topic 30: mothers infants childhood infection recommendations workshop persons hiv routinely progression newsletter confidence africa volume emerging\n",
      "Topic 31: dietary vitamin nutrition drugs drug bloom wrist injuries biochemistry abs breast accidents diet distress marijuana\n",
      "Topic 32: flee weaver villages harris cooper wounded nagorno massacre survivors refugees enclave women concussion troops corpses\n",
      "Topic 33: umpire bike gant passengers eagles philly cox ride swing rode yanks scared espn fan yerevan\n",
      "Topic 34: nyr nyi stl edm phi bos chi tor det utica ott que acknowledgement cape binghamton\n",
      "Topic 35: playback antenna mir esa hst payload booster jenks commanded sts delta vehicle eclipse payloads servicing\n",
      "Topic 36: file united ammo founder firearms counsel congress neal myths prohibit rkba amendment coalition handgun district\n",
      "Topic 37: callback colormap static xcreatewindow widget handler width attributes button visual pixmap application specification xlib usr\n",
      "Topic 38: soil mariner viking venus uranus diesel saturn voyager ceased exploring martian journey temperature radar liquid\n",
      "Topic 39: risc scsi xor encrypt burst chip serial byte synchronous registers computed bits plaintext encrypted asynchronous\n",
      "Topic 40: jews turkish shaw ottoman muslims persecuted croatia muslim nazis aspirations empire nazi turkey diplomatic political\n",
      "Topic 41: stereo cassette sony amplifier obo charger rca amp amps cables vhf toyota shipping camcorder exterior\n",
      "Topic 42: reform officers package economy president deficit billion economic republican democracy russia commitment debt democratic bush\n",
      "Topic 43: computational tutorial wiley proceedings neural attendees publishers robotics reception biological ieee conference dunn biology abstract\n",
      "Topic 44: loving punishment koresh koreshs sinners countersteering misguided innocent guilty anger dog batf sinner religion arrogant\n",
      "Topic 45: fbi batf agents warrant raid siege koresh davidians standoff atf cops cellular grenades bds compound\n",
      "Topic 46: encryption intercepts enforcement americans agencies government transmits conducting chip escrow separately smuggling terrorists administrations telecommunications\n",
      "Topic 47: conforms bytes output entirety int buf program flush col filename flags input author char args\n",
      "Topic 48: bike mlud bikes wibbled countersteering riding lean carb pillion biker motorcycle ride kawasaki honda passenger\n",
      "Topic 49: amiga graphics shading nicfunetfi misc browse cad pub abstracts interpreter ghostscript phigs texture coordination executables\n"
     ]
    }
   ],
   "source": [
    "model = topmost.ProdLDA(dataset.vocab_size, num_topics=50)\n",
    "model = model.to(device)\n",
    "\n",
    "trainer = topmost.BasicTrainer(model, dataset, verbose=True)\n",
    "\n",
    "topic_top_words, doc_topic_dist = trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "parsing texts: 100%|██████████| 2/2 [00:00<00:00, 23109.11it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[35  8]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from topmost.preprocess import Preprocess\n",
    "\n",
    "new_docs = [\n",
    "    \"This is a document about space, including words like space, satellite, launch, orbit.\",\n",
    "    \"This is a document about Microsoft Windows, including words like windows, files, dos.\"\n",
    "]\n",
    "\n",
    "preprocess = Preprocess()\n",
    "new_parsed_docs, new_bow = preprocess.parse(new_docs, dataset.vocab)\n",
    "new_theta = trainer.test(torch.as_tensor(new_bow, device=device).float())\n",
    "print(new_theta.argmax(1))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch1.13py3.8",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
